import pandas as pd
from functools import reduce
import json
from pathlib import Path



# # Data Pre-processing and Preparation
# 
# This notebook inputs the raw data collected in main_queries.ipynb (all_ss1_domains_engagement.feather) 
# and outputs the final dataset used in the main analysis.
# 
# There are four main pre-processing steps here:
# 
# 1) Filtering URLs based on engagement thresholds
# 2) Political classification
# 3) Removing domains with fewer than 10 URLs
# 4) Getting rid of negative metric counts

# 0) Load the raw engagement data (the header notes say it was collected in main_queries.ipynb).
engagement_df = pd.read_feather('data/all_ss1_domains_engagement.feather')

# 1) Filter URLs on engagement thresholds.
# Rows without a political page affinity are discarded before anything is totalled.
has_affinity = engagement_df['political_page_affinity'].notna()
engagement_df = engagement_df[has_affinity].copy()

# Per-URL totals: sum every metric over all rows that share a clean_url.
total_df = engagement_df.groupby('clean_url')[['views', 'clicks', 'shares', 'reacts']].sum()

# A URL qualifies only when its totals clear all four minimums at once.
enough_shares = total_df['shares'] >= 1000
enough_views = total_df['views'] >= 10000
enough_clicks = total_df['clicks'] >= 5000
enough_reacts = total_df['reacts'] >= 5000
threshold = total_df[enough_shares & enough_views & enough_clicks & enough_reacts].copy()
threshold_URLS = set(threshold.index)

# Keep the rows of qualifying URLs and drop the 'level_0' / 'index' columns
# (presumably left over from earlier reset_index calls -- confirm upstream).
in_threshold = engagement_df['clean_url'].isin(threshold_URLS)
engagement_df_thresholded = engagement_df[in_threshold].drop(columns=['level_0', 'index']).copy()

# 2) Political classification

def safe_load(x):
    """Parse ``x`` as JSON, tolerating malformed input.

    Returns the decoded object on success. If ``x`` is not valid JSON, the
    decode error is printed and ``None`` is returned instead of raising.
    """
    try:
        parsed = json.loads(x)
    except json.JSONDecodeError as err:
        # Report the bad line but keep going; callers filter out the None.
        print(err)
        parsed = None
    return parsed


# Read the JSON-lines file of politics scores: one JSON document per line.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding; JSON files are conventionally UTF-8.
with open("data/public_blurbs_politics.jsonl", 'r', encoding='utf-8') as file:
    json_list = [safe_load(line) for line in file]
    # safe_load returns None for lines that fail to parse; drop those.
    json_list = [line for line in json_list if line is not None]


political_df = pd.DataFrame(json_list)

# Diagnostics only (not used by the steps below): the URLs that occur on more
# than one line, and all of their rows except those for 'www.google.com'.
duplicated_urls = set(political_df[political_df.duplicated('url')]['url'])

t = political_df[(political_df['url'].isin(duplicated_urls)) & (political_df['url'] != 'www.google.com')]


# Collapse repeated URLs to a single row by averaging their politics_score,
# then label a URL 1 when its mean score is at least 0.9 and 0 otherwise.
small_df = political_df.groupby('url').agg({"politics_score": "mean"}).reset_index()
small_df['politics_label'] = (small_df['politics_score'] >= 0.9).astype('int64')
small_df.rename(columns={'url': 'clean_url'}, inplace=True)


# Left join keeps every thresholded engagement row; rows whose URL has no
# score get a missing label and are removed by the == 1 filter that follows.
df_pol = engagement_df_thresholded.merge(small_df, how='left', on='clean_url')
df_pol_only = df_pol[df_pol['politics_label'] == 1].copy()

# 3) Subsetting domains
# Count the distinct URLs each domain contributes. Counting rows and dividing
# by 5 is only correct when every URL has exactly five rows (one per political
# category); nunique() gives the URL count regardless of rows per URL.
domain_urls = df_pol_only.groupby('parent_domain')['clean_url'].nunique()

# Keep only domains that contribute at least 10 URLs.
eligible_domains = set(domain_urls[domain_urls >= 10].index)
trim_df = df_pol_only[df_pol_only['parent_domain'].isin(eligible_domains)].copy()

# Renumber the rows from 0 (discarding the old index) and drop the
# classification columns, which the remaining steps do not use.
trim_df = trim_df.reset_index(drop=True)
trim_df = trim_df.drop(columns=['politics_score', 'politics_label'])


# ## 4) Getting rid of negative metric counts
# 
# Because of SS1-added Gaussian noise, many counts in Condor are negative. Because of our engagement thresholds, all URLs included have nonzero counts in total, but specific political categories still may have negative counts. (This is even more likely because we only consider political URLs, and political URLs are likely to be partisan, and therefore lopsided.)
# 
# We don't want negative numbers making things weird (e.g. making political scores greater than 1, resulting in comparatively high variance in political scores between URLs). So we get rid of negative numbers by:
# 
# For each URL, and for each metric:
#    1. If any of the 5 political categories has a negative count,
#    2. choose the negative count with the largest magnitude, and
#    3. add that magnitude (its absolute value) to every political category.
#    4. (Now the smallest category is exactly 0 and no category is negative.)

# Diagnostics on negative share counts (not used by the later steps visible here).
# URLs that have a negative share count in at least one row, and all of their rows.
urls_with_negative_counts = set(trim_df.loc[trim_df['shares'] < 0, 'clean_url'])
negative_df = trim_df[trim_df['clean_url'].isin(urls_with_negative_counts)]

# Select 'shares' before aggregating so that only that column is summed;
# summing every column first would also try to add up the text columns.
shares_by_url = negative_df.groupby('clean_url')['shares']
total_shares = shares_by_url.sum()
most_negative_share = shares_by_url.min()


def non_negative(df, metric, categories=(-2, -1, 0, 1, 2)):
    """Shift each URL's per-category values of ``metric`` so none is negative.

    The frame is pivoted to one row per (parent_domain, clean_url) with one
    column per political_page_affinity category. For every row whose smallest
    value is negative, the magnitude of that smallest value is added to all
    categories, so the smallest becomes exactly 0. Rows without negative
    values are left unchanged. The result is melted back to long format.

    Args:
        df: Long-format frame with columns 'parent_domain', 'clean_url',
            'political_page_affinity' and ``metric``.
        metric: Name of the value column to adjust.
        categories: The political_page_affinity values to keep, in output
            order. A category absent for a URL (or absent from ``df``
            altogether) is treated as 0 before shifting.

    Returns:
        A long-format DataFrame with columns 'parent_domain', 'clean_url',
        'political_page_affinity' and ``metric``, holding one row per
        (URL, category) pair.
    """
    key_cols = ['parent_domain', 'clean_url']
    affinity_col = 'political_page_affinity'
    categories = list(categories)

    # Wide layout is needed to take a per-URL minimum across categories.
    # Duplicate (domain, URL, category) rows are averaged by pivot_table's
    # default aggregation; missing combinations become 0.
    pivot = pd.pivot_table(df, index=key_cols, columns=[affinity_col],
                           values=metric, fill_value=0)

    # Guarantee a column for every requested category, even one that never
    # occurs in df, so the shift and the melt below cannot hit a missing key.
    pivot = pivot.reindex(columns=categories, fill_value=0)

    # Per-row shift: |min| where the minimum is negative, otherwise 0.
    shift = pivot.min(axis=1).clip(upper=0).abs()
    shifted = pivot.add(shift, axis=0)

    # Back to long format, one row per (URL, category).
    melted = pd.melt(shifted.reset_index(), id_vars=key_cols,
                     value_vars=categories, var_name=affinity_col)
    return melted.rename(columns={'value': metric})


# Apply the non-negativity shift to each metric separately; every result is a
# long-format frame keyed by (parent_domain, clean_url, political_page_affinity).
metrics = ['views', 'clicks', 'shares', 'reacts']
melt_dfs = [non_negative(trim_df, metric=name) for name in metrics]

# Join the per-metric frames one after another on the shared key columns,
# producing one value column per metric.
join_keys = ['parent_domain', 'clean_url', 'political_page_affinity']
final_df = melt_dfs[0]
for metric_frame in melt_dfs[1:]:
    final_df = pd.merge(final_df, metric_frame, on=join_keys)

# Pin the column names explicitly before writing.
final_df.columns = [
    'parent_domain',
    'clean_url',
    'political_page_affinity',
    'views',
    'clicks',
    'shares',
    'reacts',
]

# Persist the final dataset described in the header notes.
final_df.to_feather('data/final_df.feather')




